/*
 * WebSPHINX web crawling toolkit
 * Copyright (C) 1998,1999 Carnegie Mellon University 
 * 
 * This library is free software; you can redistribute it
 * and/or modify it under the terms of the GNU Library
 * General Public License as published by the Free Software 
 * Foundation, version 2.
 *
 * WebSPHINX homepage: http://www.cs.cmu.edu/~rcm/websphinx/
 */
package websphinx;

/**
 * Standard classifier, installed in every crawler by default.
 * <P>On the entire page, this classifier sets the following labels:
 * <UL>
 * <LI><B>root</B>: page is the root page of a Web site.  For instance,
 *     "http://www.digital.com/" and "http://www.digital.com/index.html" are both
 *     marked as root, but "http://www.digital.com/about" is not.
 * </UL>
 * <P>Also sets one or more of the following labels on every link:
 * <UL>
 * <LI><B>hyperlink</B>: link is a hyperlink (A, AREA, or FRAME tags) to another page on the Web (using http, file, ftp, or gopher protocols)
 * <LI><B>image</B>: link is an inline image (IMG).
 * <LI><B>form</B>: link is a form (FORM tag).  A form generally requires some parameters to use.
 * <LI><B>code</B>: link points to code (APPLET, EMBED, or SCRIPT).
 * <LI><B>remote</B>: link points to a different Web server.
 * <LI><B>local</B>: link points to the same Web server.
 * <LI><B>same-page</B>: link points to the same page (e.g., by an anchor reference like "#top")
 * <LI><B>sibling</B>: a local link that points to a page in the same directory (e.g. "sibling.html")
 * <LI><B>descendent</B>: a local link that points downwards in the directory structure (e.g., "deep/deeper/deepest.html")
 * <LI><B>ancestor</B>: a local link that points upwards in the directory structure (e.g., "../..")
 * </UL>
 */
public class StandardClassifier implements Classifier  {

    /**
     * Make a StandardClassifier.
     */
    public StandardClassifier () {
    }

    /** 
     * Classify a page.
     * @param page Page to classify
     */
    // FIX: use regular expressions throughout this method
    public void classify (Page page) {
        Link origin = page.getOrigin ();
        String pagePath = origin.getFile();
        String pageFilename = origin.getFilename();
        String pageDir = origin.getDirectory ();

        if (pageFilename.equals ("") || pageFilename.startsWith ("index.htm"))
            page.setLabel ("root");

        // FIX: Link needs to resolve "foo/bar/.." and "foo/." to "foo" in order for this
        // stuff to work properly
        Link[] links = page.getLinks ();
        if (links != null) {
            for (int i=0; i<links.length; ++i) {
                Link link = links[i];
                
                if (link.getHost().equals (origin.getHost()) && link.getPort() == origin.getPort()) {
                    link.setLabel ("local");
                    
                    String linkPath = link.getFile ();
                    String linkDir = link.getDirectory ();
                    
                    if (linkPath.equals (pagePath))
                    link.setLabel ("same-page");
                    else if (linkDir.equals (pageDir))
                    link.setLabel ("sibling");
                    else if (linkDir.startsWith (pageDir))
                    link.setLabel ("descendent");
                    else if (pageDir.startsWith (linkDir))
                    link.setLabel ("ancestor");
                    // NIY: child, parent
                }
                else
                    link.setLabel ("remote");

                // Link tag kinds: resource, form, hyperlink
                String tagName = link.getTagName();
                
                if (tagName == Tag.IMG)
                    link.setLabel ("image");
                else if (tagName == Tag.APPLET || tagName == Tag.EMBED || tagName == Tag.SCRIPT)
                    link.setLabel ("code");
                else if (tagName == Tag.FORM)
                    link.setLabel ("form");
                else if (tagName == Tag.A || tagName == Tag.AREA || tagName == Tag.FRAME) {
                    String protocol = link.getProtocol ();
                    
                    if ((protocol.equals ("http")
                         || protocol.equals ("ftp")
                         || protocol.equals ("file")
                         || protocol.equals ("gopher"))
                        && link.getMethod() == Link.GET)
                        link.setLabel ("hyperlink");
                }
            }
        }
    }

    /**
     * Priority of this classifier.
     */
    public static final float priority = 0.0F;
    
    /**
     * Get priority of this classifier.
     * @return priority.
     */
    public float getPriority () {
        return priority;
    }
}
